home *** CD-ROM | disk | FTP | other *** search
- unit Scanner;
-
- interface
-
- uses
- Classes, DB, DBTables;
-
- const
- MaxBufferSize = 1024;
-
- type
- TMemoScanner = class(TBLOBStream)
- private
- Buffer: array[1..MaxBufferSize] of Char;
- Punctuation: string;
- WhiteSpace: string;
- DiscardWords: TStringList;
- protected
- FKeywords: TStringList;
- procedure DefineDiscardWords(aList: TStrings); virtual;
- function DefinePunctuation: string; virtual;
- function DefineWhitespace: string; virtual;
- function GetKeyword(aIndex: Integer): string;
- function GetKeywordCount: Integer;
- procedure KeywordFound(aKeyword: string; aWordOffset: Integer); virtual;
- public
- constructor Create(aField: TBLOBField);
- destructor Destroy; override;
- procedure Scan;
- property KeywordCount: Integer read GetKeywordCount;
- property Keywords[aIndex: Integer]: string read GetKeyword;
- end;
-
- TMemoScannerExt = class(TMemoScanner)
- protected
- procedure DefineDiscardWords(aList: TStrings); override;
- function GetWordOffset(aIndex: Integer): Integer;
- procedure KeywordFound(aKeyword: string; aWordOffset: Integer); override;
- public
- constructor Create(aField: TBLOBField);
- property WordOffset[aIndex: Integer]: Integer read GetWordOffset;
- end;
-
- implementation
-
- { TMemoScanner }
-
- constructor TMemoScanner.Create(aField: TBLOBField);
- begin
- inherited Create(aField, bmRead);
- FKeywords := TStringList.Create;
- FKeywords.Sorted := True;
- FKeywords.Duplicates := dupIgnore;
-
- DiscardWords := TStringList.Create;
- DiscardWords.Sorted := True;
- DiscardWords.Duplicates := dupIgnore;
- DefineDiscardWords(DiscardWords);
- Punctuation := DefinePunctuation;
- WhiteSpace := DefineWhiteSpace;
- end;
-
- destructor TMemoScanner.Destroy;
- begin
- DiscardWords.Free;
- inherited Destroy;
- end;
-
- procedure TMemoScanner.DefineDiscardWords(aList: TStrings);
- begin
- { There are various methods for implementing the lookup
- list. A hash table might be faster. }
- with aList do begin
- Add('A');
- Add('ALL');
- Add('AN');
- Add('AND');
- Add('ARE');
- Add('AS');
- Add('BY');
- Add('DO');
- Add('I');
- Add('IN');
- Add('NOT');
- Add('OF');
- Add('THE');
- Add('THEN');
- Add('TO');
- Add('WITH');
- end;
- end;
-
- function TMemoScanner.DefinePunctuation: string;
- begin
- { we specifically omit the hyphen and apostrophe }
- Result := '~`!@#$%^&*()+={}[]|\:;"<>,.?/';
- end;
-
- function TMemoScanner.DefineWhiteSpace: string;
- begin
- Result := #32#8#9#13#10;
- end;
-
- function TMemoScanner.GetKeyword(aIndex: Integer): string;
- begin
- Result := FKeywords[aIndex];
- end;
-
- function TMemoScanner.GetKeywordCount: Integer;
- begin
- Result := FKeywords.Count;
- end;
-
- procedure TMemoScanner.KeywordFound(aKeyword: string; aWordOffset: Integer);
- begin
- FKeywords.Add(aKeyword);
- end;
-
- procedure TMemoScanner.Scan;
- var
- Ch: Char;
- Keyword: string;
- I: Integer;
- BufLen: Integer;
- WordOffset: Integer;
- procedure AddKeyword;
- begin
- if Length(Keyword) <> 0 then begin
- Inc(WordOffset); { count words in text }
- if DiscardWords.IndexOf(Keyword) = -1 then
- KeywordFound(Keyword, WordOffset);
- Keyword := '';
- end;
- end;
- begin
- FKeywords.Clear;
- Keyword := '';
- Position := 0;
- WordOffset := -1;
- while Position < Size do begin
- BufLen := Read(Buffer, SizeOf(Buffer));
- for I := 1 to BufLen do begin
- Ch := UpCase(Buffer[I]);
- { Is it a keyword delimiter? }
- if Pos(Ch, WhiteSpace + Punctuation) <> 0 then
- AddKeyword
- else {accumulate current keyword}
- Keyword := Keyword + Ch;
- end;
- end;
- AddKeyword;
- end;
-
- { TMemoScannerExt }
-
- constructor TMemoScannerExt.Create(aField: TBLOBField);
- begin
- inherited Create(aField);
- FKeywords.Duplicates := dupAccept;
- end;
-
- procedure TMemoScannerExt.DefineDiscardWords(aList: TStrings);
- begin
- { no discard words }
- end;
-
- function TMemoScannerExt.GetWordOffset(aIndex: Integer): Integer;
- begin
- Result := Integer(FKeywords.Objects[aIndex]);
- end;
-
- procedure TMemoScannerExt.KeywordFound(aKeyword: string; aWordOffset: Integer);
- begin
- FKeywords.AddObject(aKeyword, Pointer(aWordOffset));
- end;
-
- end.
-
-